#### TABLE A3: CHARACTERISTICS OF THE SAMPLE
#### Compare survey samples to LAPOP

# The former script-level `rm(list = ls())` was dropped: it only removes
# visible global objects (attached packages, options and hidden objects all
# survive), so it does not provide a clean session, and it silently wipes the
# workspace of anyone who source()s this file. Run the script in a fresh R
# session instead.
# NOTE(review): `data_path`, `tables_path` and the packages used below are not
# defined in this file, so they are presumably provided by this setup script.
source("./2_code/00_setup.R")

#### LOAD SURVEY DATA ####

# Read the three study CSV files found under `data_path`; the first row of
# each file supplies the column names.
data1 <- fread(input = paste0(data_path, "data_study1.csv"), header = TRUE)
data2 <- fread(input = paste0(data_path, "data_study2.csv"), header = TRUE)
data3 <- fread(input = paste0(data_path, "data_study3.csv"), header = TRUE)


#### LOAD LAPOP DATA ####

# Read the merged LAPOP Stata file (with `convert.factors = FALSE`) and hold it
# as a data.table.
lapop <- data.table(
  readstata13::read.dta13(
    file = paste0(data_path, "COL_LAPOP_merge_2004-2021_v1.2_w.dta"),
    convert.factors = FALSE
  )
)

# Keep only the rows whose `year` equals 2018.
lapop18 <- filter(lapop, year == 2018)

#### RECODE LAPOP VARIABLES ####

# Gender: 1 when q1 == 1, 0 otherwise (NA stays NA).
lapop18$male <- as.integer(lapop18$q1 == 1)

# Age: q2 as a numeric column.
lapop18[, edad := as.numeric(q2)]

# Primary education: 1 when ed is at least 5, 0 otherwise (NA stays NA).
lapop18$primary_ed <- as.integer(lapop18$ed >= 5)

# Rural: 1 when ur == 2, 0 otherwise (NA stays NA).
lapop18$rural_indicator <- as.integer(lapop18$ur == 2)

# Ideology: collapse l1 into five ordered bins (1-2, 3-4, 5-6, 7-8, 9-10).
# Fixed cut points are used instead of `breaks = 5`, because the latter derives
# the bin edges from the observed range of the data: a sample that happens not
# to contain both extremes (or that contains an out-of-range code) would shift
# every bin. With data spanning 1-10 the result is identical to `breaks = 5`;
# values outside (0, 10] now become NA.
# NOTE(review): assumes l1 is coded 1-10 -- confirm against the LAPOP codebook.
lapop18$l1_rd <- cut(lapop18$l1, breaks = c(0, 2, 4, 6, 8, 10), labels = FALSE)

# Employment: label the numeric colocup4a codes and expand them into one 0/1
# indicator column per label.
# NOTE(review): rows with a missing colocup4a are removed from `lapop18`
# itself, so every LAPOP statistic computed later in this script is restricted
# to respondents with a recorded employment status -- confirm this is intended.
lapop18 <- lapop18[!is.na(colocup4a)]
lapop18[, colocup4a_recode := recode(
  colocup4a,
  "1" = "Working",
  "2" = "Looking",
  "3" = "Actively_Seeking",
  "4" = "Student",
  "5" = "Domestic",
  "6" = "Disability",
  "7" = "Retired",
  "8" = "Not_Looking"
)]

# Indicator matrix without an intercept column; strip the variable-name prefix
# so that the columns are named after the labels alone (e.g. "Working").
dummy_variables <- model.matrix(~ 0 + colocup4a_recode, data = lapop18)
colnames(dummy_variables) <- gsub(
  "colocup4a_recode", "", colnames(dummy_variables)
)
lapop18 <- cbind(lapop18, dummy_variables)


#### RECODE STUDY 3 VARIABLES ####

# Government evaluation: translate the Spanish response labels into the values
# 1 to 7; any label not listed below becomes NA.
# NOTE(review): "Algo" is coded below "Poco" -- presumably this mirrors the
# order of the questionnaire's response scale; confirm.
data3[, gov_evaluation_R := recode(
  gov_evaluation,
  "Nada" = 1,
  "Algo" = 2,
  "Poco" = 3,
  "Ni poco, ni bastante" = 4,
  "Bastante" = 5,
  "Mucho" = 6,
  "Muchísimo" = 7,
  .default = NA_real_
)]

# Education: translate the Spanish schooling labels into ordinal codes 0 to 6;
# any label not listed below becomes NA.
data3[, edurcord := recode(
  education,
  "Primaria no terminada" = 0,
  "Primaria" = 1,
  "Secundaria no terminada" = 2,
  "Secundaria" = 3,
  "Bachicherato, Preparatoria o Escuela Técnica" = 4,
  "Pregrado o Ingeniería" = 5,
  "Posgrado" = 6,
  .default = NA_real_
)]

# Indicator equal to 1 from "Secundaria no terminada" upwards.
# NOTE(review): respondents whose highest level is "Primaria" (code 1) get 0
# here, whereas the LAPOP indicator in this script uses ed >= 5 -- confirm that
# the two definitions are meant to be comparable.
data3[, primary_ed := as.integer(edurcord >= 2)]

# Rural: code the place-of-residence labels 1 to 4 (unlisted labels become NA),
# then flag the respondents who answered "En una zona rural" (code 4).
data3[, residence_rcord := recode(
  residence,
  "Una ciudad" = 1,
  "En la periferia o alrededores de una ciudad" = 2,
  "En un pueblo cercano a una zona rural" = 3,
  "En una zona rural" = 4,
  .default = NA_real_
)]

data3[, rural_indicators := as.integer(residence_rcord == 4)]

# Ideology: code the five left-right labels from 1 ("Izquierda") to
# 5 ("Derecha"); any label not listed below becomes NA.
data3[, political_ideology_rcord := recode(
  political_ideology,
  "Izquierda" = 1,
  "Centro izquierda" = 2,
  "Centro" = 3,
  "Centro derecha" = 4,
  "Derecha" = 5,
  .default = NA_real_
)]

# Political info: code the frequency labels from 1 (daily) to 5 (never) in
# order of decreasing frequency; any label not listed below becomes NA.
# The monthly and yearly answers were previously coded 4 and 3 respectively,
# which made the scale non-monotone (weekly < yearly < monthly < never); they
# are now coded 3 (monthly) and 4 (yearly).
# NOTE(review): this item is compared with LAPOP `gi0n`, presumably coded
# 1 = daily ... 5 = never -- confirm against the LAPOP codebook.
# "Nuca" is kept because it may be how the label is spelled in the data file;
# the correctly spelled "Nunca" is accepted too, so that those answers are not
# silently turned into NA.
data3$political_info_rcord <- recode(data3$political_info,
                                     "Diariamente" = 1,
                                     "Algunas veces a la semana" = 2,
                                     "Algunas veces al mes" = 3,
                                     "Algunas veces al año" = 4,
                                     "Nuca" = 5,
                                     "Nunca" = 5,
                                     .default = NA_real_)

# Employment: translate the Spanish employment answers into short English
# labels (answers not listed are left unchanged), then expand the labels into
# one 0/1 indicator column per label.
data3[, employed_recode := recode(
  employed,
  "Trabajando" = "Working",
  "No estoy trabajando en este momento pero tengo trabajo" = "Looking",
  "Estoy buscando trabajo activamente" = "Actively_Seeking",
  "Soy estudiante" = "Student",
  "Me dedico a los quehaceres de mi hogar" = "Domestic",
  "Estoy incapacitado permanentemente para trabajar" = "Disability",
  "Estoy jubilado o pensionado" = "Retired",
  "No trabajo y no estoy buscando trabajo" = "Not_Looking"
)]

# Indicator matrix without an intercept column; strip the variable-name prefix
# so that the columns are named after the labels alone (e.g. "Working").
# NOTE(review): under the default na.action, model.matrix() omits rows whose
# employed_recode is NA, so the cbind() below presumably relies on `employed`
# having no missing values -- confirm.
job_vars <- model.matrix(~ 0 + employed_recode, data = data3)
colnames(job_vars) <- gsub("employed_recode", "", colnames(job_vars))
data3 <- cbind(data3, job_vars)


#### RECODE STUDY 2 VARIABLES ####

# Translate the Spanish schooling labels used in Studies 1 and 2 into ordinal
# codes 0 to 6. Any label not listed becomes NA.
# The mapping was previously written out twice (once per study); keeping it in
# one helper guarantees that both studies are coded identically.
#
# schooling: character vector of schooling labels.
# Returns a numeric vector of the same length.
recode_schooling <- function(schooling) {
  recode(schooling,
         "Primaria no terminada" = 0,
         "Primaria" = 1,
         "Secundaria no terminada" = 2,
         "Secundaria" = 3,
         "Bachicherato, Preparatoria o Escuela Técnica" = 4,
         "Pregrado o Ingeniería" = 5,
         "Posgrado" = 6,
         .default = NA_real_)
}

data2$edurcord <- recode_schooling(data2$schooling)

# Indicator equal to 1 from "Secundaria no terminada" upwards.
# NOTE(review): respondents whose highest level is "Primaria" (code 1) get 0
# here, whereas the LAPOP indicator in this script uses ed >= 5 -- confirm that
# the two definitions are meant to be comparable.
data2$primary_ed <- as.integer(data2$edurcord >= 2)


#### RECODE STUDY 1 VARIABLES ####

data1$edurcord <- recode_schooling(data1$schooling)

# Same indicator definition as for Study 2.
data1$primary_ed <- as.integer(data1$edurcord >= 2)


#### CREATE SUMMARY FUNCTIONS ####

# Descriptive statistics of one variable, computed on its non-missing values.
#
# variable: a numeric (or logical/integer) vector; may contain NA.
# Returns a named vector with elements Mean, Median, SD, Min, Max and N.
#
# N is the number of non-missing observations, i.e. the number of values the
# other five statistics are based on (it previously counted NAs as well, so it
# could overstate the sample behind the reported mean).
# When there is no non-missing value at all (all NA, empty or NULL input) the
# five statistics are NA and N is 0, instead of Inf / -Inf plus warnings.
calculate_stats <- function(variable) {
  observed <- variable[!is.na(variable)]

  if (length(observed) == 0L) {
    return(c(Mean = NA_real_,
             Median = NA_real_,
             SD = NA_real_,
             Min = NA_real_,
             Max = NA_real_,
             N = 0))
  }

  c(Mean = mean(observed),
    Median = median(observed),
    SD = sd(observed),
    Min = min(observed),
    Max = max(observed),
    N = length(observed))
}

# Build a table with one row of descriptive statistics per variable.
#
# variables: a named list of vectors; the names become the `Variable` column.
# Returns a data.frame with columns Variable, Mean, Median, SD, Min, Max, N,
# where the statistics of each row come from calculate_stats().
# Stops with an error when a non-empty `variables` has missing or empty names,
# because the rows could not be labelled.
create_summary_table <- function(variables) {
  stat_names <- c("Mean", "Median", "SD", "Min", "Max", "N")
  labels <- names(variables)

  if (length(variables) > 0L &&
      (is.null(labels) || anyNA(labels) || !all(nzchar(labels)))) {
    stop("`variables` must be a list whose elements are all named.",
         call. = FALSE)
  }

  stats_list <- lapply(variables, calculate_stats)
  summary_table <- data.frame(Variable = as.character(labels))

  for (stat in stat_names) {
    # vapply() guarantees one numeric value per variable (sapply() would
    # silently change type on unexpected input); names are dropped so the
    # columns are plain numeric vectors.
    summary_table[[stat]] <- vapply(
      stats_list,
      function(x) unname(x[stat]),
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  summary_table
}


#### SELECT VARIABLES FOR EACH TABLE ####

# Each list maps a table row label to the column holding that variable; the
# columns are pulled out of the corresponding dataset by name.

# LAPOP 2018
variables_LAPOP18 <- lapply(
  c(Male = "male",
    Age = "edad",
    `Primary Education` = "primary_ed",
    Rural = "rural_indicator",
    `Govrmt Evaluation` = "b3",
    Ideology = "l1_rd",
    `Political Info` = "gi0n",
    Working = "Working"),
  function(column) lapop18[[column]]
)

# Study 3
variables_SURVEY3 <- lapply(
  c(Male = "male",
    Age = "edad",
    `Primary Education` = "primary_ed",
    Rural = "rural_indicators",
    `Govrmt Evaluation` = "gov_evaluation_R",
    Ideology = "political_ideology_rcord",
    `Political Info` = "political_info_rcord",
    Working = "Working"),
  function(column) data3[[column]]
)

# Study 2
variables_SURVEY2 <- lapply(
  c(Male = "male",
    Age = "edad",
    `Primary Education` = "primary_ed",
    Working = "worked_dummy"),
  function(column) data2[[column]]
)

# Study 1
variables_SURVEY1 <- lapply(
  c(Male = "male",
    Age = "edad",
    `Primary Education` = "primary_ed",
    Working = "worked_dummy"),
  function(column) data1[[column]]
)


#### CREATE SUMMARY TABLES ####

# One summary table per data source.
summary_table_18 <- create_summary_table(variables = variables_LAPOP18)
summary_table_SRV3 <- create_summary_table(variables = variables_SURVEY3)
summary_table_SRV2 <- create_summary_table(variables = variables_SURVEY2)
summary_table_SRV1 <- create_summary_table(variables = variables_SURVEY1)


#### EXPORT TO LATEX ####

# Convert each summary table to an xtable object.
xtable1 <- xtable(summary_table_18)
xtable2 <- xtable(summary_table_SRV3)
xtable3 <- xtable(summary_table_SRV2)
xtable4 <- xtable(summary_table_SRV1)

# Bundle the four tables; the sub-headings (one per table, in order) and the
# note are attached as attributes of the list.
table_list <- structure(
  list(xtable1, xtable2, xtable3, xtable4),
  subheadings = c("Lapop", "Study 3 (2023)", "Study 2 (2022)", "Study 1 (2021)"),
  message = c("Note: Descriptive statistics on sampled participants and Lapop survey.")
)

# Decimal places per column.
# NOTE(review): presumably ordered as row names, Variable, Mean, Median, SD,
# Min, Max, N -- confirm against the xtable `digits` convention.
dig <- c(0, 0, 2, 0, 2, 0, 0, 0)

table_repre <- xtableList(
  table_list,
  caption = "Characteristics of the Sample",
  label = "repre",
  digits = dig
)

# Render the combined table and keep the LaTeX source that print() returns.
latex_output <- print(
  table_repre,
  caption.placement = "top",
  colnames.format = "multiple"
)

# Write the LaTeX source to the tables directory.
writeLines(latex_output, paste0(tables_path, "table_A3.tex"))
